On cherche à étudier comparer des méthode d’inférence de réseaux (co-expression et régulation).
## corrplot 0.84 loaded
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## number of items read is not a multiple of the number of columns
load(paste0("./GenesCO2_", specie, ".RData"))
load("./normalized.count_At.RData")
# quantification file
data <- read.csv("quantifFiles/QuantifGenes.csv", h = T, sep = ",")
rownames(data) <- data$Gene
genes = which(!(grepl("__", rownames(data))))
not_quant = data[which((grepl("__", rownames(data)))), ]
data = data[genes, grepl("R", colnames(data))]
keep <- rowSums(data) >= 10
data <- data[keep, ]
group <- sapply(colnames(data), getLabel, with.rep = F)
colnames(data) <- sapply(colnames(data), getLabel)
specie = "At"
clusteredGenes <- clustering(sharedBy3, data)****************************************
coseq analysis: Poisson approach & none transformation
K = 2 to 12
Use set.seed() prior to running coseq for reproducible results.
****************************************
Running g = 2 ...
[1] "Initialization: 1"
[1] "Log-like diff: 0"
Running g = 3 ...
[1] "Initialization: 1"
[1] "Log-like diff: 0"
Running g = 4 ...
[1] "Initialization: 1"
[1] "Log-like diff: 9.85664883046411e-11"
Running g = 5 ...
[1] "Initialization: 1"
[1] "Log-like diff: 1.33795765577815e-08"
Running g = 6 ...
[1] "Initialization: 1"
[1] "Log-like diff: 4.44822489953367e-09"
Running g = 7 ...
[1] "Initialization: 1"
[1] "Log-like diff: 0"
Running g = 8 ...
[1] "Initialization: 1"
[1] "Log-like diff: 4.87787588099309e-11"
Running g = 9 ...
[1] "Initialization: 1"
[1] "Log-like diff: 1.4540333381774e-09"
Running g = 10 ...
[1] "Initialization: 1"
[1] "Log-like diff: 8.95090579433599e-09"
Running g = 11 ...
[1] "Initialization: 1"
[1] "Log-like diff: 1.21511166639721e-06"
Running g = 12 ...
[1] "Initialization: 1"
[1] "Log-like diff: 0"
$ICL
$profiles
$boxplots
$probapost_barplots
*************************************************
Model: Poisson
Transformation: none
*************************************************
Clusters fit: 2,3,4,5,6,7,8,9,10,11,12
Clusters with errors: ---
Selected number of clusters via ICL: 11
ICL of selected model: -740101.6
*************************************************
Number of clusters = 11
ICL = -740101.6
*************************************************
Cluster sizes:
Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5 Cluster 6 Cluster 7
3 12 6 11 11 17 18
Cluster 8 Cluster 9 Cluster 10 Cluster 11
4 26 21 2
Number of observations with MAP > 0.90 (% of total):
130 (99.24%)
Number of observations with MAP > 0.90 per cluster (% of total per cluster):
Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5 Cluster 6 Cluster 7
3 12 6 10 11 17 18
(100%) (100%) (100%) (90.91%) (100%) (100%) (100%)
Cluster 8 Cluster 9 Cluster 10 Cluster 11
4 26 21 2
(100%) (100%) (100%) (100%)
NULL
Model-Based Clustering Using MPLN (Parallelized) Description Performs clustering using mixtures of multivariate Poisson-log normal (MPLN) distribution and model selection using AIC, AIC3, BIC and ICL. Since each component/cluster size (G) is independent from another, all Gs in the range to be tested have been parallelized to run on a seperate core using the parallel R package.
Class: pca dudi
Call: dudi.pca(df = log(data + 0.1), center = TRUE, scale = TRUE, scannf = FALSE,
nf = 4)
Total inertia: 24
Eigenvalues:
Ax1 Ax2 Ax3 Ax4 Ax5
17.2861 4.7060 0.6320 0.5028 0.2615
Projected inertia (%):
Ax1 Ax2 Ax3 Ax4 Ax5
72.025 19.608 2.633 2.095 1.089
Cumulative projected inertia (%):
Ax1 Ax1:2 Ax1:3 Ax1:4 Ax1:5
72.03 91.63 94.27 96.36 97.45
(Only 5 dimensions (out of 24) are shown)
NULL
## PNLModels
groups <- str_split_fixed(colnames(data), "_", 2)[, 1]
co2 <- str_split_fixed(groups, "", 3)[, 1]
nitrate <- factor(str_split_fixed(groups, "", 3)[, 2])
nitrate <- relevel(nitrate, "N")
fer <- factor(str_split_fixed(groups, "", 3)[, 3])
fer = relevel(fer, "F")
covariates <- data.frame(row.names = colnames(data), co2, nitrate, fer)
DEGenes <- sharedBy3
# preparation des données
counts <- round(t(data[DEGenes, ]), 0)
plnData <- prepare_data(counts = counts, covariates = covariates)
PCA_models <- PLNPCA(Abundance ~ 1 + nitrate + fer + co2 + offset(log(Offset)), data = plnData,
ranks = 1:10)
Initialization...
Adjusting 10 PLN models for PCA analysis.
Rank approximation = 1
Rank approximation = 2
Rank approximation = 3
Rank approximation = 4
Rank approximation = 5
Rank approximation = 6
Rank approximation = 7
Rank approximation = 8
Rank approximation = 9
Rank approximation = 10
Post-treatments
DONE!
--------------------------------------------------------
COLLECTION OF 10 POISSON LOGNORMAL MODELS
--------------------------------------------------------
Task: Principal Component Analysis
========================================================
- Ranks considered: from 1 to 10
- Best model (greater BIC): rank = 10 - R2 = 0.97
- Best model (greater ICL): rank = 10 - R2 = 0.97
myPCA_ICL <- getBestModel(PCA_models, "ICL")
plot(myPCA_ICL, ind_cols = groups, var_cols = factor(clusteredGenes[rownames(data[DEGenes,
])]))gridExtra::grid.arrange(plot(myPCA_ICL, ind_cols = groups, map = "individual", plot = FALSE),
plot(myPCA_ICL, var_cols = factor(clusteredGenes[rownames(data[DEGenes, ])]),
map = "variable", plot = FALSE), ncol = 2)
Initialization...
Adjusting 30 PLN with sparse inverse covariance estimation
Joint optimization alternating gradient descent and graphical-lasso
sparsifying penalty = 1.673463
sparsifying penalty = 1.545729
sparsifying penalty = 1.427745
sparsifying penalty = 1.318766
sparsifying penalty = 1.218106
sparsifying penalty = 1.125129
sparsifying penalty = 1.039249
sparsifying penalty = 0.9599238
sparsifying penalty = 0.8866536
sparsifying penalty = 0.8189761
sparsifying penalty = 0.7564644
sparsifying penalty = 0.6987241
sparsifying penalty = 0.6453911
sparsifying penalty = 0.5961289
sparsifying penalty = 0.5506269
sparsifying penalty = 0.508598
sparsifying penalty = 0.4697772
sparsifying penalty = 0.4339195
sparsifying penalty = 0.4007988
sparsifying penalty = 0.3702062
sparsifying penalty = 0.3419487
sparsifying penalty = 0.315848
sparsifying penalty = 0.2917396
sparsifying penalty = 0.2694714
sparsifying penalty = 0.2489028
sparsifying penalty = 0.2299043
sparsifying penalty = 0.2123559
sparsifying penalty = 0.196147
sparsifying penalty = 0.1811752
sparsifying penalty = 0.1673463
Post-treatments
DONE!
Stability Selection for PLNnetwork:
subsampling: ++++++++++++++++++++Poisson Lognormal with sparse inverse covariance (penalty = 0.167)
==================================================================
nb_param loglik BIC ICL R_squared n_edges EBIC pen_loglik
955 -21692.38 -23209.91 -17592.43 1 431 -23852.85 -21788.51
density
0.05
==================================================================
* Useful fields
$model_par, $latent, $var_par, $optim_par
$loglik, $BIC, $ICL, $loglik_vec, $nb_param, $criteria
* Useful S3 methods
print(), coef(), sigma(), vcov(), fitted(), predict(), standard_error()
* Additional fields for sparse network
$EBIC, $density, $penalty
* Additional S3 methods for network
plot.PLNnetworkfit()
V(net_norm)$color <- clusteredGenes[V(net_norm)]
plot.igraph(net_norm, vertex.size = 10, vertex.label.cex = 0.5, edge.width = 0.5)[1] 431
Faut-il normaliser les données avant geneie? (pris en compte dans PLN)
library(GENIE3)
TF <- read.table("TFs_PlnTFDB.txt", h = T, sep = "\t")
TF$AGI <- str_split_fixed(TF$Protein.ID, "\\.", 2)[, 1]
for (thr in seq(0.05, 0.39, by = 0.05)) {
print(thr)
genie(normalized.count, regressors = intersect(TF$AGI, sharedBy3), targets = sharedBy3,
thr = thr)
}[1] 0.05
[1] 0.1
[1] 0.15
[1] 0.2
[1] 0.25
[1] 0.3
[1] 0.35
net <- genie(normalized.count, regressors = intersect(TF$AGI, sharedBy3), targets = sharedBy3,
thr = 0.25)